\appendix
\chapter{Jaccard Vs. FSSim}
\label{AppendixA}
\lhead{Appendix A. \emph{Jaccard Vs. FSSim}}
 


Amos Tversky \citep{tversky1977features} proposed the \textit{ratio model} as a measure of similarity between two sets A and B.  The ratio model is given below:

\begin{equation}
%\footnotesize
S(A,B)=\frac {|A \cap B|}{|A \cap B| +\alpha  |A-B| + \beta  |B-A|}, \alpha , \beta \geq 0
\label{eq:ratiomodel}
\end{equation}
The parameters $\alpha$ and $\beta$ balance the weight of the differences in the equation. This equation normalizes the similarity so that $S$ is between 0 and 1. We can show that this model generalizes several set-theoretical models of similarity proposed in literature. If $\alpha = \beta = 1$ it reduces to the Jaccard coefficient, i.e.\  $Jaccard(A,B)= \frac {|A \cap B|}{|A \cup B|}$:

\begin{proof}
\label{eq:proof1}
Replacing $\alpha = \beta = 1$ in E.q. \ref{eq:ratiomodel}, we have $S(A,B)=\frac {|A \cap B|}{|A \cap B| +  |A-B| + |B-A|}$. Then, using the identity $|AUB| = |A \cap B| +  |A-B| + |B-A|$, we have: $\frac {|A \cap B|}{|A \cap B| +  |A-B| + |B-A|} = \frac {|A \cap B|}{|A \cup B|} = Jaccard(A,B)$
\end{proof}

It is easy to show that when FSSim is replaced by Jaccard it violates the Theorem \ref{theorem:t1}.
\begin{proof}
By counterexample: Let $|A \cap B| = 20$ and $|A \cap C|= 10$, and let $|A \cup B| = 40$ and $|A \cup C|= 20$. Then
$|A\cap B| > |A \cap C|$ but $\frac {|A \cap B|}{|A \cup B|} = \frac {|A \cap C|}{|A \cup C|} \Rightarrow \frac{20}{40} = \frac{10}{20} \Rightarrow  Jaccard(A,B) = Jaccard (A,C)$.
\end{proof}

Now we proof that Theorem \ref{theorem:t1} is valid for FSSim(A,B).
\begin{lemma}
If $|A \cap B| > 0$ then $\frac{|A-B|+|B-A|}{2|A\cup B|} < 1$
\label{lemma:lemma2}
\end{lemma}
\begin{proof}
Proof of Lemma \ref{lemma:lemma2}: If $|A \cap B| > 0$ then $|A-B|+|B-A| <  |A \cap B| + |A-B|+|B-A| < 2(|A \cap B| + |A-B|+|B-A|).$ Applying identity mentioned in Proof \ref{eq:proof1}, we have: $|A-B|+|B-A| <  2|A\cup B| \Rightarrow \frac{|A-B|+|B-A|}{2|A\cup B|} < 1$
\end{proof}

\begin{proof}
Proof of Theorem \ref{theorem:t1}: If $|A\cap B| > |A \cap C|$ then $|A \cap B| > 0$.  Let a positive integer $\delta < 1$ and $\omega <1$, then $|A\cap B| > |A \cap C| + (\delta  - \omega ) \Rightarrow |A\cap B| - \delta > |A \cap C|  - \omega $. By Lemma \ref{lemma:lemma2} $\delta = \frac{|A-B|+|B-A|}{2|A\cup B|} <1 $ and $ \omega = \frac{|A-C|+|C-A|}{2|A\cup C|} < 1$, then $|A\cap B| - \frac{|A-B|+|B-A|}{2|A\cup B|}  > |A \cap C| - \frac{|A-C|+|C-A|}{2|A\cup C|} \Rightarrow  FSSim(A,B) > FSSim(A,C)$.
\end{proof}

%\section{Set Reduction}
%In the section we proof the Theorem \ref{theorem:nphard}.
%\begin{proof} We prove that the problem is NP-hard using a reduction from set cover problem. Given a set of elements $\mathcal{U}=\{u_1, u_2, \dots, u_n\}$ and collection of sets $\mathcal{T} = \{T_1, T_2,\dots, T_m\}$, where $\forall i: T_i \in 2^\mathcal{U}$, the set cover problem is to identify the smallest number of sets $\mathcal{X}$ in $\mathcal{T} $ whose union still contains all elements in the universe $\mathcal{U}$. In our problem $\mathcal{T} = C(S)$, $\mathcal{X} = C(S^*)$ and an small variant of the set cover problem is that our universe  is represented by the distribution of the features of the set $C(S)$, i.e., $\mathcal{U}=Pr(F(C(S)))$. Our goal is still to find a smallest set $C(S^*)$ so that the features $F(C(S^*))$ occur in the same frequency than the features in $F(C(S))$. In other words, the only difference of the set cover problem is that instead of counting the coverage of the elements in $\mathcal{U}$, we consider a function over those elements (e.g. $Pr(X)$). Our problem itself is a set cover problem. 
%\end{proof}

